Read in and clean data
# ss is csv file with school shooting data (individual incidents)
ss = read_csv("ss_data.csv")%>%
select(-X1)%>%
mutate(year = as.numeric(substr(date, (nchar(date)-3),nchar(date))))%>%
mutate(state = ifelse(state == "St. Croix, US Virgin Islands","St.Croix",state))%>%
distinct_at(.vars = c(1:24), .keep_all = T)%>%
filter(category != "Accident")%>%
filter(reliability_score_1_5!=1)
Population data
# data set to go from state names to abbreviations
states = data.frame(state.name,state.abb)
dc = data.frame(matrix(ncol= 2, nrow = 1))
colnames(dc)=colnames(states)
dc$`state.name` = "District of Columbia"
dc$`state.abb` = "DC"
states = rbind(states,dc)
states$state.name = as.character(states$state.name)
if(import_raw_data){
# 1980-1990
pop8090_raw = read.delim("https://www2.census.gov/programs-surveys/popest/tables/1980-1990/state/asrh/st8090ts.txt" )%>%data.frame()
pop8090_raw = pop8090_raw[c(8:58,64:114),]%>%data.frame()
colnames(pop8090_raw)="var"
pop8090 = data.frame(pop8090_raw$var[1:51],pop8090_raw$var[52:102])
colnames(pop8090)=c("var1","var2")
pop8090 = pop8090%>%
separate(var1, c("state",as.character(seq(1980,1984,by = 1))), extra = "merge")%>%
separate(var2, c("state1",as.character(seq(1985,1990,by = 1))), extra = "merge")%>%
select(c(state,starts_with("19")))
# 1970 - 1980
pop7080_raw = read.delim("https://www2.census.gov/programs-surveys/popest/tables/1980-1990/state/asrh/st7080ts.txt")
pop7080_raw = pop7080_raw[c(10:61,63:114),]%>%data.frame()
colnames(pop7080_raw)="var"
pop7080 = data.frame(pop7080_raw$var[1:51],pop7080_raw$var[53:103])
colnames(pop7080)=c("var1","var2")
pop7080 = pop7080%>%
separate(var1, c("blnk","id","state","1970","1971","1972","1973","1974","1975"), extra = "merge")%>%
separate(var2, c("blnk1","id1","state1",as.character(seq(1976,1980,by = 1))), extra = "merge")%>%
select(c(state,starts_with("19")))
# 1990-2000
pop90_00 = read_csv("../../data/pop_90-00.csv")%>%
rename(state = X1)%>%
select(c(state,3:13))%>%
filter(state != "USA")
colnames(pop90_00)[2:ncol(pop90_00)]=as.character(seq(1990,2000,by = 1))
pop90_00$state[30:35] = states$state.name[29:34]
pop90_00$state[40:42] = states$state.name[39:41]
pop90_00$state[9] = states$state.name[51]
pop90_00$state[49] = states$state.name[48]
# 2000-2010
pop00_10 = read.csv("https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/state/st-est00int-agesex.csv")%>%
clean_names()%>%
filter(sex == 0, age == 999)%>%
select(c(name,starts_with("popestimate")))%>%
rename_at(vars(starts_with("popestimate")),funs(substr(.,start = 12, stop = 15)))%>%
rename(state = name)%>%
mutate(state = as.character(state))%>%
mutate(state = ifelse(state=="District of Columbia",as.character(states$state.name[51]),state))%>%
filter(state != "United States")
# 2010-2018
pop10_18 = read_csv("../../data/pop_2010-2018.csv")
colnames(pop10_18)=pop10_18[1,]
pop10_18 = pop10_18%>%
slice(-1)%>%
rename_at(vars(starts_with("Pop")), funs(substr(.,start = 38,stop = 41)))%>%
rename(state = Geography)%>%
select(c(state,starts_with("20")))%>%
mutate(state = ifelse(state=="District of Columbia",as.character(states$state.name[51]),state))
# 2019
pop2019 = read_csv("../../data/2019pop.csv")%>%
clean_names()%>%
select(c(state,pop))%>%
rename("2019"=pop)
pops = left_join(pop7080,pop8090%>%select(-c("1980")), by = "state")%>%
left_join(states,by = c("state"="state.abb"))%>%
mutate(state = state.name)%>%
select(-state.name)%>%
left_join(pop90_00%>%select(-c("1990")), by = "state")%>%
left_join(pop00_10%>%select(-c("2000")), by = "state")%>%
left_join(pop10_18%>%select(-c("2010")), by = "state")%>%
left_join(pop2019,by = "state")%>%
gather(key = "year", value = "population", c(2:51))
write.csv(pops, "state_populations.csv")
}
Read in population data
# read in population data (include abbreviated and full state name)
pops = read_csv("state_populations.csv")%>%
select(-X1)%>%
left_join(states, by = c("state"="state.name"))
Incident count format
sts = unique(ss$state)
yrs = seq(range(ss$year)[1],range(ss$year)[2],by=1)
st_yr = data.frame(state = rep(sts,length(yrs)), year = sort(rep(yrs,length(sts))))%>%
arrange(state)
## data set aggrgated by state and year
ss_counts = ss%>%
group_by(state,year) %>%
summarize(incident_count = n(),
total_victims = sum(total_injured_killed_victims),
total_fatalities = sum(killed_includes_shooter),
total_wounded = sum(wounded),
avg_victims = mean(total_injured_killed_victims),
avg_fatalities = mean(killed_includes_shooter),
ave_wounded = mean(wounded),
avg_shooter_age = mean(shooter_age),
max_shooter_age = max(shooter_age),
min_shooter_age = min(shooter_age),
avg_reliability = mean(reliability_score_1_5),
target = mean(ifelse(targeted_specific_victim_s=="Y",1,
ifelse(targeted_specific_victim_s=="N",0,NA)),na.rm = T),
random_vict = mean(ifelse(random_victims=="Y",1,
ifelse(random_victims=="N",0,NA)),na.rm = T),
bullied = mean(ifelse(bullied_y_n_n_a=="Y",1,
ifelse(bullied_y_n_n_a=="N",0,NA)),na.rm = T),
domestic_violence = mean(ifelse(domestic_violence_y_n=="Y",1,
ifelse(domestic_violence_y_n=="N",0,NA)),na.rm = T))%>%
mutate(in_ss = TRUE)%>%
full_join(st_yr, by = c("state","year"))%>%
left_join(pops%>%rename(full_state_name = state), by = c("state"="state.abb", "year"))%>%
mutate(incident_count = ifelse(is.na(incident_count),0,incident_count))%>%
mutate(in_ss = ifelse(is.na(in_ss),FALSE,in_ss))
Grade data
# read in state grade data
if(import_raw_data){
grades18 = read_csv("../../data_1/2018grades.csv")%>%
select(-c(X6,X7))%>%
clean_names()%>%
select(-gun_death_rate_per_100k)%>%
mutate(year = "2018")%>%
rename(grade = x2018_grade)%>%
select(state,grade,year)
grades17 = read_csv("../../data_1/2017grades.csv")%>%
clean_names()%>%
mutate(year = "2017")%>%
rename(grade = x2017_grade)%>%
select(state,grade,year)
grades16 = read_csv("../../data_1/2016grades.csv")%>%
clean_names()%>%
mutate(year = "2016")%>%
rename(grade = x2016_grade)%>%
select(state,grade,year)
grades15 = read_csv("../../data_1/2015grades.csv")%>%
clean_names()%>%
mutate(year = "2015")%>%
rename(grade = x2015_grade)%>%
select(state,grade,year)
# add in missing value
grades15 = rbind(grades15,c("Kansas","F",2015))
grades14 = read_csv("../../data_1/2014grades.csv")%>%
clean_names()%>%
mutate(year = "2014")%>%
rename(grade = x2014_grade)%>%
select(state,grade,year)
grades13 = read_csv("../../data_1/2013grades.csv")%>%
clean_names()%>%
mutate(year = "2013")%>%
rename(grade = x2013_grade)%>%
select(state,grade,year)
grades12 = read_csv("../../data_1/2012grades.csv")%>%
clean_names()%>%
mutate(year = "2012")%>%
rename(grade = x2012_grade)%>%
select(state,grade,year)
# combine all year data
grades = bind_rows(grades18,grades17)%>%
bind_rows(grades16)%>%
bind_rows(grades15)%>%
bind_rows(grades14)%>%
bind_rows(grades13)%>%
bind_rows(grades12)
write.csv(grades,"state_grades.csv")
}
Read in grade data
grades = read_csv("state_grades.csv")%>%
select(-X1)
gpa_convert = data.frame(letter_grade = c("A","A-","B+","B","B-",
"C+","C","C-","D+","D","D-","F"),
gpa = c(4,3.7,3.3,3,2.7,2.3,2,1.7,1.3,1,0.7,0))
ss_counts = ss_counts%>%
left_join(grades%>%select(state,grade,year),by = c("full_state_name"="state","year"))%>%
left_join(gpa_convert, by = c("grade"="letter_grade"))
Poverty data
pov=read.csv("census_poverty_data.csv",stringsAsFactors = FALSE)
pov=filter(pov,State!=" United States"&State!=" United States"& State!="District of Columbia"&State!="United States")
names(pov)[1:2]<-c("state","poverty")
names(pov)[4]<-"year"
skel=expand_grid(sort(unique(pov$state)),min(pov$year):(max(pov$year)+2))
names(skel)<-c("state","year")
sk1=filter(skel,year==2007|year==2008|year==2009)
sk1=left_join(sk1,filter(pov,year==2007),by="state")
sk2=filter(skel,year==2010|year==2011|year==2012)
sk2=left_join(sk2,filter(pov,year==2010),by="state")
sk3=filter(skel,year==2013|year==2014|year==2015)
sk3=left_join(sk3,filter(pov,year==2013),by="state")
sk4=filter(skel,year==2016|year==2017|year==2018)
sk4=left_join(sk4,filter(pov,year==2016),by="state")
pov=rbind(sk1,sk2,sk3,sk4)
pov=pov[,1:3]
names(pov)<-c("state","year","poverty")
Merge poverty data
pov = left_join(pov,states, by = c("state"="state.name"))%>%
select(-state)%>%
rename(state = state.abb)
ss_counts = left_join(ss_counts, pov, by = c("state","year"))
Bullying data
bully = read_csv("State_bullying.csv")%>%
select(c(total_score,state.abb))%>%
rename(state = state.abb)%>%
mutate(year = 2010)
ss_counts = left_join(ss_counts, bully%>%select(-year), by = "state")%>%
rename(bullying_score = total_score)%>%
mutate(bullying_score = as.numeric(bullying_score))
Mental Health data
mh.data=read.csv("mh.data.csv",stringsAsFactors = FALSE)%>%
clean_names()%>%
select(-x)%>%
left_join(states, by = c("state"="state.name"))%>%
select(-state)%>%
rename(state = state.abb)
ss_counts = ss_counts%>%
left_join(mh.data, by = c("state","year"))%>%
rename(mh_score = percent)
## Warning: Column `state` joining character vector and factor, coercing into
## character vector
CDC Bullying Data
qns = c("qn13","qn14","qn15","qn16","qn17","qn18","qn19","qn20","qn24","qn25","qn26","qn27","qn28","qn29","qn30","qnbullyweight","qnbullygay")
sts = yrbss::getListOfParticipatingStates()
yrs = c(1991,1993,1995,1997,1999,2001,2003,2005,2007,2009,2011,2013,2015)
if(import_raw_data){
bullying = data.frame(variable = NULL, state = NULL, year = NULL, prop= NULL, ciLB= NULL, ciUB = NULL, n = NULL)
for(v in qns){
for(s in sts){
for(y in yrs ){
p = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$prop
ciL = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$ciLB
ciU = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$ciUB
sample_size = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$n
newrow = data.frame(variable = v, state = s, year = y, prop= p, ciLB= ciL, ciUB = ciU, n = sample_size)
bullying = bind_rows(bullying,newrow)}
}}
lab = data.frame(question = yrbss::yrbss_questions_binary$variable, label = yrbss::yrbss_questions_binary$label)%>%
filter(question %in% qns)
bullying = left_join(bullying,lab, by = c(variable = "question"))
bullying$state[which(bullying$state=="AZB")]="AZ"
write_csv(bullying, "bullying_survey.csv")
}
lab = data.frame(question = yrbss::yrbss_questions_binary$variable, label = yrbss::yrbss_questions_binary$label)%>%
filter(question %in% qns)
# read in csv
bullying_survey = read_csv("bullying_survey.csv")
## Parsed with column specification:
## cols(
## variable = col_character(),
## state = col_character(),
## year = col_double(),
## prop = col_double(),
## ciLB = col_double(),
## ciUB = col_double(),
## n = col_double(),
## label = col_character()
## )
# check missingness for 2009-2015
sum(!complete.cases(filter(bullying_survey, year>=2007)))/nrow(filter(bullying_survey, year>=2007))
## [1] 0.3135135
### We're going to need to impute values for missing years and states for some of these variables if these data are going to be useful in our analysis
Make combined variables for survey questions
# combine suicide questions
suicide = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
suicide_qns = bullying_survey%>%
filter(variable %in% c("qn26","qn27","qn28","qn29","qn30"))
sts[which(sts=="AZB")]="AZ"
for(y in yrs){
for(st in sts){
dat = filter(suicide_qns,(year ==y & state == st))
suicide_score = weighted.mean(dat$prop,(dat$n/sum(dat$n)), na.rm = TRUE)
samp = mean(dat$n,na.rm = TRUE)
newrow = data.frame(year = y, state = st, score = suicide_score, n = samp)
suicide = bind_rows(suicide,newrow)
}
}
# combine questions involving physical fights
fight = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
fight_qns = bullying_survey%>%
filter(variable %in% c("qn18","qn19","qn20"))
for(y in yrs){
for(st in sts){
dat = filter(fight_qns,(year ==y & state == st))
fight_score = weighted.mean(dat$prop,(dat$n/sum(dat$n)),, na.rm = TRUE)
samp = mean(dat$n,na.rm = TRUE)
newrow = data.frame(year = y, state = st, score = fight_score, n = samp)
fight = bind_rows(fight,newrow)
}
}
# school safety questions
safety = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
safety_qns = bullying_survey%>%
filter(variable %in% c("qn13","qn14","qn15","qn16","qn17"))
for(y in yrs){
for(st in sts){
dat = filter(safety_qns,(year ==y & state == st))
safety_score = weighted.mean(dat$prop,(dat$n/sum(dat$n)), na.rm = TRUE)
samp = mean(dat$n,na.rm = TRUE)
newrow = data.frame(year = y, state = st, score = safety_score, n = samp)
safety = bind_rows(safety,newrow)
}
}
# bullying questions
bull = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
bully_qns = bullying_survey%>%
filter(variable %in% c("qn24","qn25","qnbullyweight","qnbullygay"))
for(y in yrs){
for(st in sts){
dat = filter(bully_qns,(year ==y & state == st))
bully_score = weighted.mean(dat$prop,(dat$n/sum(dat$n)), na.rm = TRUE)
samp = mean(dat$n,na.rm = TRUE)
newrow = data.frame(year = y, state = st, score = bully_score, n = samp)
bull = bind_rows(bull,newrow)
}
}
survey_combined = bind_cols(suicide,safety, fight, bull)%>%
select(year, state, score, n, score1, n1, score2, n2, score3, n3)%>%
rename(suicide_score = score, school_safety_score = score1, fight_score = score2, bully_score = score3)%>%
filter(year>=2007)%>%
full_join(st_yr%>%filter(year>=2007))%>%
arrange(year)%>%
arrange(state)%>%
mutate(state = ifelse(state == "AZB","AZ",state))
survey_combined = survey_combined%>%
group_by(state)%>%
mutate(suicide_score =ifelse(year<2016,ifelse(is.na(suicide_score), na.locf(suicide_score)/2+na.locf(suicide_score, fromLast = TRUE)/2,suicide_score),
NA ))%>%
mutate(school_safety_score =ifelse(year<2016,ifelse(is.na(school_safety_score), na.locf(school_safety_score)/2+na.locf(school_safety_score, fromLast = TRUE)/2,school_safety_score),NA))%>%
mutate(fight_score =ifelse(year<2016,ifelse(is.na(fight_score), na.locf(fight_score)/2+na.locf(fight_score, fromLast = TRUE)/2,fight_score),NA))%>%
mutate(bully_score =ifelse(year<2016,ifelse(is.na(bully_score), na.locf(bully_score)/2+na.locf(bully_score, fromLast = TRUE)/2,bully_score),NA))%>%
mutate(n=ifelse(year<2016,ifelse(is.na(n), na.locf(n)/2+na.locf(n, fromLast = TRUE)/2,n),NA))%>%
mutate(n1=ifelse(year<2016,ifelse(is.na(n1), na.locf(n1)/2+na.locf(n1, fromLast = TRUE)/2,n1),NA))%>%
mutate(n2=ifelse(year<2016,ifelse(is.na(n2), na.locf(n2)/2+na.locf(n2, fromLast = TRUE)/2,n2),NA))%>%
mutate(n3=ifelse(year<2016,ifelse(is.na(n3), na.locf(n3)/2+na.locf(n3, fromLast = TRUE)/2,n3),NA))
# use fitted line to get 2016-2019 values
for(s in sts[which(!(sts %in% c("MA","AZB")))]){
dat = filter(survey_combined, state ==s)
ind = which(is.na(dat$suicide_score))[1]
slp = dat[ind-1,3:10]-dat[ind-2,3:10]
dat[which(is.na(dat$suicide_score))[1],3:10] = dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp
dat[which(is.na(dat$suicide_score))[1],3:10] = dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp
dat[which(is.na(dat$suicide_score))[1],3:10] = dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp
dat[which(is.na(dat$suicide_score))[1],3:10] = dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp
survey_combined[which(survey_combined$state == s),]=dat
}
# chech to see if mean and median are different
survey_combined%>%
group_by(year)%>%
summarise(med_ss = median(suicide_score, na.rm = T), mn_ss = mean(suicide_score, na.rm = T),
med_safe = median(school_safety_score, na.rm = T), mn_safe = mean(school_safety_score, na.rm = T),
med_f = median(fight_score, na.rm = T), mn_f = mean(fight_score, na.rm = T),
med_b = median(bully_score, na.rm = T), mn_b = mean(bully_score, na.rm = T))
## # A tibble: 13 x 9
## year med_ss mn_ss med_safe mn_safe med_f mn_f med_b mn_b
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2007 0.133 0.130 0.0908 0.0880 0.152 0.154 0.195 0.198
## 2 2008 0.128 0.131 0.0874 0.0866 0.151 0.153 0.195 0.198
## 3 2009 0.132 0.132 0.0839 0.0847 0.144 0.152 0.195 0.198
## 4 2010 0.131 0.132 0.0823 0.0847 0.144 0.145 0.188 0.186
## 5 2011 0.129 0.133 0.0826 0.0838 0.137 0.142 0.186 0.187
## 6 2012 0.136 0.139 0.0822 0.0850 0.126 0.134 0.182 0.186
## 7 2013 0.143 0.143 0.0841 0.0888 0.120 0.125 0.183 0.187
## 8 2014 0.147 0.146 0.0844 0.0864 0.108 0.120 0.189 0.185
## 9 2015 0.152 0.149 0.0809 0.0831 0.104 0.113 0.189 0.185
## 10 2016 0.156 0.152 0.0754 0.0798 0.104 0.106 0.189 0.184
## 11 2017 0.156 0.155 0.0746 0.0765 0.0993 0.0993 0.189 0.184
## 12 2018 0.154 0.157 0.0713 0.0731 0.0965 0.0925 0.189 0.184
## 13 2019 0.156 0.160 0.0712 0.0698 0.0933 0.0857 0.189 0.184
# impute missing values with median value for each column
survey_combined = survey_combined%>%
ungroup()%>%
group_by(year)%>%
mutate(suicide_score = ifelse(is.na(suicide_score), median(suicide_score, na.rm = T),suicide_score))%>%
mutate(school_safety_score = ifelse(is.na(school_safety_score), median(school_safety_score, na.rm = T),school_safety_score))%>%
mutate(fight_score = ifelse(is.na(fight_score), median(fight_score, na.rm = T),fight_score))%>%
mutate(bully_score = ifelse(is.na(bully_score), median(bully_score, na.rm = T),bully_score))%>%
mutate(n = ifelse(is.na(n), median(n, na.rm = T),n))%>%
mutate(n1 = ifelse(is.na(n1), median(n1, na.rm = T),n1))%>%
mutate(n2 = ifelse(is.na(n2), median(n2, na.rm = T),n2))%>%
mutate(n3 = ifelse(is.na(n3), median(n3, na.rm = T),n3))%>%
mutate(school_safety_score = ifelse(school_safety_score<0,0,school_safety_score))%>%
mutate(fight_score = ifelse(fight_score<0,0,fight_score))
ss_counts = left_join(ss_counts, survey_combined, by = c("state","year"))%>%
filter(state != "DC")
rand.count.data = read_csv("cleaned.rand.csv")%>%
select(-X1)%>%
left_join(states, by = c("state"= "state.name"))%>%
select(-state)%>%
rename(state = state.abb)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## state = col_character()
## )
## See spec(...) for full column specifications.
ss_counts = left_join(ss_counts,rand.count.data, by = c("state","year"))
## Warning: Column `state` joining character vector and factor, coercing into
## character vector
ss_counts = ss_counts%>%
mutate(access_laws = -1*(age_permissive-age_restrictive+child_access_permissive-child_access_restrictive+prohibited_possessor_permissive-prohibited_possessor_restrictive))%>%
mutate(transport= -1*(castle_doctrine_permissive-castle_doctrine_restrictive+open_carry_permissive-open_carry_restrictive+carrying_permissive-carrying_permissive))%>%
mutate(screening = -1*(background_check_permissive- background_check_restrictive + waiting_period_permissive- waiting_period_restrictive + permit_permissive - permit_restrictive + registration_permissive - registration_restrictive + safety_training_permissive - safety_training_restrictive + dealer_license_permissive - dealer_license_restrictive + one_gun_per_permissive - one_gun_per_restrictive))%>%
mutate(ban = -ban_permissive+ban_restrictive)%>%
select(-c(36:63))
EDA
table(ss$reliability_score_1_5)%>%
data.frame()%>%
ggplot(aes(x = Var1, y = Freq))+
geom_bar(aes(fill = Var1), stat = "identity")+
scale_fill_discrete(name = "Reliability Score")+
labs(x = "Reliability Score", y = "Count")

table(ss$school_type)%>%
data.frame()%>%
ggplot(aes(x = Var1, y = Freq))+
geom_bar(aes(fill = Var1), stat = "identity")+
scale_fill_discrete(name = "School Type")+
labs(x = "School Type", y = "Count")

ss%>%
filter(shooter_age<200)%>%
ggplot(aes(x = reorder(state, shooter_age, FUN = mean), y = shooter_age))+
geom_boxplot()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
labs(y = "Age of shooter", x = "State", title = "Age of Shooter by State")

ss%>%
ggplot(aes(x = reorder(state, killed_includes_shooter, FUN = mean), y = killed_includes_shooter))+
geom_boxplot()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
labs(y = "Numbeer Killed (including shooter)", x = "State", title = "Fatalities")

ss%>%
ggplot(aes(x = reorder(state, wounded, FUN = mean), y = total_injured_killed_victims))+
geom_boxplot()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
labs(y = "Wounded ", x = "State", title = "Number Wounded")

ss%>%
ggplot(aes(x = reorder(state, total_injured_killed_victims, FUN = mean), y = total_injured_killed_victims))+
geom_boxplot()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
labs(y = "Wounded or Killed", x = "State", title = "Combined Wounded & Fatalities")

table(ss$state)%>%
data.frame()%>%
ggplot(aes(x = Var1, y = Freq))+
geom_bar(aes(x = reorder(Var1,Freq, desc)), stat = "identity")+
labs(x = "State", y = "Count", "Number of incidents by State (overall)")+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))

Grade plots
cc = scales::seq_gradient_pal("green", "red", "Lab")(seq(0,1,length.out=12))
cc2 = scales::seq_gradient_pal("green", "red", "Lab")(seq(0,1,length.out=5))
# This will handle the ordering
d <- ss_counts %>%
filter(year %in% seq(2012,2018,by=1))%>%
mutate(grade = substr(grade,1,1))%>%
ungroup() %>% # As a precaution / handle in a separate .grouped_df method
arrange(year, grade) %>% # arrange by facet variables and continuous values
mutate(.r = row_number()) # Add a row number variable
ggplot(d, aes(x = .r, y= incident_count, fill = grade)) + # Use .r instead of x
geom_point(data = d%>%filter(incident_count==0),
aes(x = .r, y= incident_count, color = grade),
size = 0.7) +
geom_bar(stat = "identity")+
facet_wrap(~ year, scales = "free_x") + # Should free scales (though could be left to user)
scale_x_continuous( # This handles replacement of .r for x
breaks = d$.r, # notice need to reuse data frame
labels = d$state
)+
labs(title = "Incident Count by Grade", x = "State", y = "School Shootings")+
scale_fill_viridis_d( name = "Grade", direction = -1)+
scale_color_viridis_d( guide = FALSE, direction = -1)+
theme_bw()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 3.5), legend.position = c(0.7,.15), legend.direction = "horizontal",legend.key.size = unit(1.5,"line"), legend.text = element_text(size = 8), legend.title = element_text(size = 10),plot.title = element_text(hjust = 0.5))

### Outcome plot:
ss_counts%>%
group_by(year)%>%
summarise(ss = sum(incident_count))%>%
ggplot(aes(x = year, y = ss))+
geom_line(color = "red")+
labs(x = "Year", y = "School Shootings", title = "School Shootings in the U.S.")+
theme_bw()+
theme(plot.title = element_text(hjust = 0.5))

ss_counts%>%
ggplot(aes(x = reorder(state, population, FUN = mean), y = incident_count, color = state))+
geom_boxplot()+
theme_bw()+
coord_flip()+
theme(legend.position = "none")

grades = left_join(grades, gpa_convert, by = c("grade"="letter_grade"))
dat_loc= map_data("state")%>%
mutate(sts=toupper(region))
b=full_join(grades%>%mutate(state = tolower(state)), dat_loc, by=c("state"="region"))
b = b%>%mutate(year = as.numeric(year))%>%filter(!is.na(year))%>%select(-subregion)
b = b[complete.cases(b),]
map_plts = ggplot(data = b, aes(x = long, y = lat, group = group))+
geom_polygon(aes(fill = gpa), color = "white")+
scale_fill_viridis(option = "D", name = "GPA")+
labs(title = "Grade Changes by Year", subtitle = 'Year: {frame_time}', x = NULL, y = NULL)+
theme_bw()+
theme(line = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank(), plot.title = element_text(hjust = 0.5))+
transition_time(year, range = c(2012,2018))
animate(map_plts, nframes = 7, fps = 1)

anim_save("state_grade_animation.GIF")
dat_loc = left_join(dat_loc,states%>%mutate(state.name = tolower(state.name)), by = c("region"="state.name"))
b2=full_join(ss_counts%>%ungroup()%>%filter(year>=2007)%>%filter(!(state%in%c("AK","HI")))%>%select(state,year,access_laws,transport,screening,ban), dat_loc, by=c("state"="state.abb"))%>%
select(-subregion)%>%
gather(key = "law_class", value = "number_in_effect", c(3:6))
b2 = b2[complete.cases(b2),]
law.labs <- c("Access Laws", "Bans","Screening Laws", "Transport Laws")
names(law.labs) <- c("access_laws", "ban","screening","transport")
map_plts_law = ggplot(data = b2, aes(x = long, y = lat, group = group))+
geom_polygon(aes(fill = number_in_effect), color = "white")+
scale_fill_viridis(option = "D", name = "Laws in\nEffect\n(R-P)")+
labs(title = "Law Changes by Year",subtitle = 'Year: {frame_time}', x = NULL, y = NULL)+
theme_bw()+
facet_wrap(.~law_class, labeller = labeller(law_class = law.labs))+
theme(line = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank(), plot.title = element_text(hjust = 0.5))+
transition_time(year, range = c(2007,2019))
animate(map_plts_law, nframes = 13, fps = 1)

anim_save("state_law_animation.GIF")
ss_counts%>%
filter(year %in% seq(2012,2018,by=1))%>%
filter(!is.na(grade))%>%
ggplot(aes(x = incident_count , fill = grade))+
geom_histogram(binwidth = 1)+
facet_wrap(.~year)+
scale_fill_manual(values = cc)+
scale_x_continuous(breaks = seq(0,35,by=5))+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 3.5))+
theme_bw()+
labs(x= "Number of School Shootings")

ss_counts%>%
filter(year %in% seq(2012,2018,by=1))%>%
filter(!is.na(grade))%>%
ggplot(aes(x = grade, y = incident_count, color = grade))+
geom_boxplot()+
facet_wrap(.~year)+
theme_bw()+
scale_color_manual(values = cc)+
labs(y= "Number of School Shootings", x = "Grade")

ss_counts%>%
filter(year %in% seq(2012,2018,by=1))%>%
ggplot()+
geom_bar(aes(x = reorder(state,population), y = incident_count), stat = "identity")+
geom_point(aes(x = state, y = log(population), color = "population"), size = .4)+
theme_bw()+
labs(x = "State", y = "Number of School Shootings", title = "Number of School Shooting and Population by State")+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 3.5), legend.position = c(.7, 0), legend.justification = c(1, 0))+
facet_wrap(.~year)+
scale_color_discrete(name = NULL, labels = "Population (log)")

ss_counts%>%
filter(year %in% seq(2012,2018,by=1))%>%
group_by(state) %>%
mutate(ordr = mean(avg_victims, na.rm = T))%>%
filter(ordr!="NaN")%>%
ungroup()%>%
arrange(desc(ordr))%>%
ggplot(aes(x = reorder(state,ordr), y = avg_victims, color = year))+
geom_point(size = 0.6, position = "jitter")+
labs(x = "State", y = "Average number of victims per incident", title ="Victims per School Shooting")+
scale_color_viridis(option = "D")+
theme_bw()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))

Media Plots
media%>%
ggplot(aes(x = prev_month_art, y = mass_shooting, color = as.numeric(year)))+geom_point(position = "jitter")+
scale_color_continuous(name = "Year")+
labs(y = "Mass Shooting Count", x = "Articles in Previous Month")+
theme_bw()

media%>%
mutate(date = as.Date(zoo::as.yearmon(paste(year,"-",month, sep = ""))))%>%
ggplot(aes(y = articles_shootings_excluding_firearm_laws_and_regulations, x= date, color = year))+
geom_point()+
geom_jitter()+
labs(y = "Monthly Articles",x = "Year", title = "Media Coverage of Shootings", subtitle = "Articles in NYT and Washington Post Regarding Shootings")+
scale_color_continuous(name = "Year")+
theme_bw()+
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5, size = 8))

media%>%
ggplot(aes(y = articles_shootings_excluding_firearm_laws_and_regulations/yearly_shootings, x= as.numeric(year)))+
geom_point()+
geom_jitter()+
labs(y = "Monthly Articles/Yearly Mass Shootings",x = "Year")+
scale_color_continuous(name = "Year")+
theme_bw()

media%>%
ggplot(aes(y = yearly_articles, x= as.numeric(year)))+
geom_point()+
labs(y = "Yearly Articles",x = "Year")+
theme_bw()

school_counts = ss%>%
select(school,city,state)%>%
unite(col = school_city,c(school,city,state), sep = "_")%>%
table(dnn = c("school","count"))%>%
data.frame()%>%
separate(col = school_city, into = c("school","city","state"), sep = "_" )
delta = data.frame(state = NULL, year = NULL, delta_gpa = NULL, delta_ss = NULL, delta_pop = NULL)
for(x in unique(ss_counts$state)){
dat = filter(ss_counts, state == x)%>%
arrange(year)
chng_gpa = c(NA,diff(dat$gpa))
chng_ss = c(NA, diff(dat$incident_count))
chng_pop = c(NA, diff(dat$population))
st = rep(x, nrow(dat))
newrows =data.frame(state = st, year = dat$year, delta_gpa = chng_gpa, delta_ss = chng_ss, delta_pop= chng_pop)
delta = bind_rows(delta,newrows)
}
ss_counts%>%
group_by(year)%>%
summarise(yearly_incidents = sum(incident_count), prev_year_art = prev_year_art[1] )%>%
filter(year %in% seq(1999,2017, by = 1))%>%
ggplot(aes(x = year, y = yearly_incidents, fill = prev_year_art))+geom_bar(stat = "identity")+
labs(y = "Yearly School Shootings", x = "Year")+
scale_fill_viridis(option = "D", name = "Articles in previous year")+
theme_bw()

Bullying survey plots
ggplot(data= bullying_survey%>%filter(year>=2013), aes(x = year, y = prop, grouby_by = state, color = variable))+
geom_line(position = "jitter")+
scale_color_viridis_d()+
theme_bw()

ggplot(data= bullying_survey%>%filter(variable %in% qns[9:17]), aes(x = year, y = prop, color = variable))+
geom_point(position = "jitter")+
geom_smooth(se=F)+
scale_color_viridis_d()+
theme_bw()

lab = data.frame(question = yrbss::yrbss_questions_binary$variable, label = yrbss::yrbss_questions_binary$label)%>%
filter(question %in% qns)
kableExtra::kable(lab, byrow = F, caption = "Questions topics")
Questions topics
|
question
|
label
|
|
qn13
|
Carried a weapon
|
|
qn14
|
Carried a gun
|
|
qn15
|
Carried a weapon on school property
|
|
qn16
|
Did not go to school because they felt unsafe at school or on their way to or from school
|
|
qn17
|
Were threatened or injured with a weapon on school property
|
|
qn18
|
Were in a physical fight
|
|
qn19
|
Were injured in a physical fight
|
|
qn20
|
Were in a physical fight on school property
|
|
qn24
|
Were bullied on school property
|
|
qn25
|
Were electronically bullied
|
|
qn26
|
Felt sad or hopeless
|
|
qn27
|
Seriously considered attempting suicide
|
|
qn28
|
Made a plan about how they would attempt suicide
|
|
qn29
|
Attempted suicide
|
|
qn30
|
Attempted suicide that resulted in an injury, poisoning, or overdose that had to be treated by a doctor or nurse
|
|
qnbullyweight
|
Been teased b/c of weight past 12 mos
|
|
qnbullygay
|
Been teased b/c labeled GLB past 12 mos
|
bullying_survey%>%
slice(which(complete.cases(bullying_survey)))%>%
select(label)%>%
table()%>%
data.frame()%>%
kableExtra::kable(caption = "Observations per Question")
Observations per Question
|
.
|
Freq
|
|
Attempted suicide
|
298
|
|
Attempted suicide that resulted in an injury, poisoning, or overdose that had to be treated by a doctor or nurse
|
282
|
|
Been teased b/c labeled GLB past 12 mos
|
18
|
|
Been teased b/c of weight past 12 mos
|
18
|
|
Carried a gun
|
231
|
|
Carried a weapon
|
275
|
|
Carried a weapon on school property
|
292
|
|
Did not go to school because they felt unsafe at school or on their way to or from school
|
298
|
|
Felt sad or hopeless
|
251
|
|
Made a plan about how they would attempt suicide
|
291
|
|
Seriously considered attempting suicide
|
308
|
|
Were bullied on school property
|
124
|
|
Were electronically bullied
|
94
|
|
Were in a physical fight
|
298
|
|
Were in a physical fight on school property
|
294
|
|
Were injured in a physical fight
|
270
|
|
Were threatened or injured with a weapon on school property
|
291
|
surv.labs = c("Bullying Score", "Physical Altercation Score", "School Danger Perception Score", "Suicidal Thoughts or Actions Score")
names(surv.labs)= c("bully_score", "fight_score", "school_safety_score","suicide_score")
survey_combined%>%
gather(key = score_type, percent, c(3,5,7,9))%>%
ggplot(aes(x = state, y = percent, color = score_type ))+
geom_boxplot()+
scale_color_viridis_d()+
labs(title = "Survey Reponses by State", x = "State", y = "Percent Answering 'Yes'")+
facet_wrap(.~score_type,labeller = labeller(score_type = surv.labs))+
theme_bw()+
theme(legend.position = "none")+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 4.5))

Model Fitting
fit_grade = glm(incident_count~gpa+poverty+bullying_score+ prev_year_art+year+suicide_score+fight_score+school_safety_score+bully_score+ offset(log(population)), family = "poisson", data = ss_counts)
summary(fit_grade)
##
## Call:
## glm(formula = incident_count ~ gpa + poverty + bullying_score +
## prev_year_art + year + suicide_score + fight_score + school_safety_score +
## bully_score + offset(log(population)), family = "poisson",
## data = ss_counts)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0430 -0.8783 -0.4997 0.4867 2.2694
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.360e+01 1.177e+02 -0.625 0.532
## gpa -3.189e-01 7.065e-02 -4.513 6.38e-06 ***
## poverty -3.955e-02 3.050e-02 -1.297 0.195
## bullying_score 8.407e-02 5.133e-02 1.638 0.101
## prev_year_art 5.163e-04 3.360e-04 1.537 0.124
## year 2.831e-02 5.842e-02 0.485 0.628
## suicide_score 3.477e+00 4.898e+00 0.710 0.478
## fight_score -3.526e+00 2.612e+00 -1.350 0.177
## school_safety_score -3.702e+00 3.999e+00 -0.926 0.355
## bully_score 1.762e+00 3.442e+00 0.512 0.609
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 314.90 on 299 degrees of freedom
## Residual deviance: 266.23 on 290 degrees of freedom
## (2200 observations deleted due to missingness)
## AIC: 590.55
##
## Number of Fisher Scoring iterations: 5
fit_law = glm(reformulate(names(ss_counts)[c(23,25,26,28,30,32,34,36:39)],names(ss_counts)[3]), offset=log(population), family = "poisson", data = ss_counts)
summary(fit_law)
##
## Call:
## glm(formula = reformulate(names(ss_counts)[c(23, 25, 26, 28,
## 30, 32, 34, 36:39)], names(ss_counts)[3]), family = "poisson",
## data = ss_counts, offset = log(population))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4926 -0.8152 -0.5216 0.3660 3.0823
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.568e+01 1.083e+00 -14.472 < 2e-16 ***
## prev_year_art 5.114e-04 2.232e-04 2.291 0.02196 *
## poverty -3.079e-02 2.245e-02 -1.372 0.17011
## bullying_score 6.396e-02 4.207e-02 1.520 0.12839
## suicide_score 7.534e+00 4.097e+00 1.839 0.06593 .
## school_safety_score -4.671e+00 3.754e+00 -1.244 0.21338
## fight_score -5.600e+00 2.007e+00 -2.791 0.00526 **
## bully_score -2.604e+00 2.899e+00 -0.898 0.36909
## access_laws -7.306e-02 2.704e-02 -2.701 0.00690 **
## transport 8.518e-02 5.182e-02 1.644 0.10026
## screening 3.969e-03 2.895e-02 0.137 0.89096
## ban -2.183e-01 1.205e-01 -1.811 0.07009 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 560.79 on 549 degrees of freedom
## Residual deviance: 485.60 on 538 degrees of freedom
## (1950 observations deleted due to missingness)
## AIC: 966.03
##
## Number of Fisher Scoring iterations: 5
z_fit = pscl::zeroinfl(reformulate(names(ss_counts)[c(23,25,26,28,30,32,34,36:39)],names(ss_counts)[3]), offset=log(population), dist = "poisson", data = ss_counts)
summary(z_fit)
##
## Call:
## pscl::zeroinfl(formula = reformulate(names(ss_counts)[c(23, 25,
## 26, 28, 30, 32, 34, 36:39)], names(ss_counts)[3]), data = ss_counts,
## offset = log(population), dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -1.7698 -0.5790 -0.3020 0.2628 4.0157
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.608e+01 1.110e+00 -14.488 < 2e-16 ***
## prev_year_art 1.859e-04 2.121e-04 0.876 0.38091
## poverty -5.386e-02 2.348e-02 -2.294 0.02182 *
## bullying_score 4.507e-02 4.285e-02 1.052 0.29281
## suicide_score 8.400e+00 4.207e+00 1.997 0.04584 *
## school_safety_score 1.711e+00 4.107e+00 0.416 0.67707
## fight_score -6.332e+00 2.023e+00 -3.130 0.00175 **
## bully_score 1.959e+00 3.127e+00 0.626 0.53104
## access_laws -7.996e-02 2.706e-02 -2.955 0.00313 **
## transport 1.178e-01 5.404e-02 2.180 0.02928 *
## screening -1.606e-02 2.982e-02 -0.538 0.59031
## ban -9.321e-02 1.222e-01 -0.763 0.44549
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -30.45079 22.52488 -1.352 0.1764
## prev_year_art -0.02772 0.01508 -1.838 0.0660 .
## poverty -0.47080 0.36127 -1.303 0.1925
## bullying_score -0.51831 0.40051 -1.294 0.1956
## suicide_score 119.99216 69.81387 1.719 0.0857 .
## school_safety_score 105.54272 77.51072 1.362 0.1733
## fight_score -26.23736 36.94041 -0.710 0.4775
## bully_score 185.21430 123.72378 1.497 0.1344
## access_laws -0.59825 0.52271 -1.145 0.2524
## transport -1.18048 1.09534 -1.078 0.2812
## screening -1.83901 1.15038 -1.599 0.1099
## ban 5.92543 4.21001 1.407 0.1593
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 295
## Log-likelihood: -453.8 on 24 Df
fit_law_victims = MASS::glm.nb(total_victims ~ prev_year_art + poverty + bullying_score + suicide_score + school_safety_score + fight_score + bully_score + access_laws +
transport + screening + ban+ offset(log(population)), data = ss_counts)
summary(fit_law_victims)
##
## Call:
## MASS::glm.nb(formula = total_victims ~ prev_year_art + poverty +
## bullying_score + suicide_score + school_safety_score + fight_score +
## bully_score + access_laws + transport + screening + ban +
## offset(log(population)), data = ss_counts, init.theta = 2.07623449,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4293 -0.8294 -0.2343 0.4273 3.4255
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.727e+01 1.330e+00 -12.985 < 2e-16 ***
## prev_year_art -4.689e-05 2.866e-04 -0.164 0.870040
## poverty -8.704e-02 2.928e-02 -2.973 0.002946 **
## bullying_score 2.294e-02 5.089e-02 0.451 0.652171
## suicide_score 1.657e+01 5.172e+00 3.205 0.001353 **
## school_safety_score -8.095e+00 5.805e+00 -1.395 0.163160
## fight_score 7.175e+00 2.539e+00 2.826 0.004710 **
## bully_score 6.918e+00 3.845e+00 1.799 0.071985 .
## access_laws -1.184e-01 3.175e-02 -3.729 0.000192 ***
## transport 1.820e-01 7.025e-02 2.591 0.009577 **
## screening 7.706e-02 3.482e-02 2.213 0.026875 *
## ban -3.372e-01 1.499e-01 -2.250 0.024454 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(2.0762) family taken to be 1)
##
## Null deviance: 248.49 on 195 degrees of freedom
## Residual deviance: 195.50 on 184 degrees of freedom
## (2304 observations deleted due to missingness)
## AIC: 802.79
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 2.076
## Std. Err.: 0.370
##
## 2 x log-likelihood: -776.795
1-pchisq(195.70 , df = 184)
## [1] 0.2636877
plot(DHARMa::simulateResiduals(fit_law))

if(FALSE){
ts_dat = ss_counts%>%
ungroup()%>%
select(-c(4:17,23:24,27))%>%
select(-starts_with("n"))%>%
filter(year>=2007)%>%
filter(year<2019)
ts_y = ts(ts_dat%>%ungroup()%>%select(incident_count))
ts_x = ts_dat%>%
select(c(6,9:18))%>%
mutate(population = log(population))%>%
as.matrix()
ts_fit = tsglm(ts_y, xreg = ts_x, model = list(past_obs = 2, external = c(FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE,TRUE,TRUE)), link = "log", distr = "poisson")
best_fit = forecast::auto.arima(diff(ts_y), xreg = diff(ts_x))
astsa::sarima(diff(ts_y),p = 2, d = 0, q =2, xreg = diff(ts_x))
arima(diff(ts_y),order = c(2,0,2), xreg = diff(ts_x))$coef
sqrt(diag(vcov( arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x)))))
cilb= arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x))$coef-sqrt(diag(vcov( arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x)))))
ciub = arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x))$coef+sqrt(diag(vcov( arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x)))))
data.frame(Lower= cilb, Upper = ciub)
}